In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
# Show up to 100 columns when displaying wide DataFrames.
# The fully-qualified key is required: on pandas >= 1.4 the bare pattern
# 'max_columns' also matches 'styler.render.max_columns' and raises OptionError.
pd.set_option('display.max_columns', 100)
%matplotlib inline
In [2]:
# Column groups used throughout the notebook.
COLS_EXTRA = ['id', 'rodada']                        # bookkeeping keys, dropped before fitting
COLS_FEATURES = ['pontos', 'preco', 'media_pontos']  # model inputs
COL_ANSWER = ['variacao_preco']                      # column the regression target is derived from

# Every column kept from the raw file, in order: keys, features, answer.
COLS_OF_INTEREST = [*COLS_EXTRA, *COLS_FEATURES, *COL_ANSWER]
In [3]:
# Read the raw CSV (path is relative to this notebook's directory),
# report its (rows, columns) shape and preview the first rows.
csv_path = '../../../data/desafio_valorizacao/valorizacao_cartola_2018.csv'
df = pd.read_csv(csv_path)
print(df.shape)
df.head()
Out[3]:
In [4]:
# Narrow the frame to the key, feature and answer columns only,
# then show the new shape and a preview.
df = df.loc[:, COLS_OF_INTEREST]
print(df.shape)
df.head()
Out[4]:
In [5]:
# Per-column flag: True where the column contains at least one missing value.
df.isna().any(axis=0)
Out[5]:
In [6]:
%%time
# Build the training samples: each row pairs a player's data in round `rodada`
# with that same player's price variation in round `rodada + 1`.
samples_per_round = []
for rodada in range(1, 38):  # rounds 1..37, each matched against round + 1
    df_rod_atual = df[df.rodada == rodada]
    df_rod_prox = df[df.rodada == (rodada + 1)]

    # Left join on player id. The shared 'variacao_preco' column is split into
    # 'variacao_preco_atual' and 'variacao_preco_prox' by the suffixes.
    df_merge = df_rod_atual.merge(
        df_rod_prox[['id', 'variacao_preco']],
        how='left',
        on='id',
        suffixes=('_atual', '_prox'),
    )

    # Drop rows with any missing value; this removes players that have no
    # row in the next round (their 'variacao_preco_prox' is NaN after the join).
    df_merge = df_merge.dropna()

    # Keep only rows whose price actually changed in both rounds.
    df_merge = df_merge[(df_merge.variacao_preco_atual != 0) & (df_merge.variacao_preco_prox != 0)]
    samples_per_round.append(df_merge)

# Concatenate once at the end: DataFrame.append was removed in pandas 2.0 and
# re-copied the whole accumulated frame on every iteration.
df_samples = pd.concat(samples_per_round)
print(df_samples.shape)
In [7]:
# Spot-check the generated samples for one player. The id is pinned so the
# output is reproducible; to inspect a random player instead, use:
# random_player = np.random.choice(df.id.unique())
random_player = 83786
# Filter by the variable (not a repeated literal) so changing the id above
# updates this view too.
df_samples[df_samples.id == random_player].sort_values(by='rodada')
Out[7]:
In [8]:
# Same player's rows in the unpaired frame, ordered by round, for comparison
# with the paired samples.
is_selected_player = df['id'] == random_player
df.loc[is_selected_player].sort_values(by='rodada')
Out[8]:
In [9]:
# Feature matrix: every sample column except the bookkeeping keys and the target.
non_feature_cols = [*COLS_EXTRA, 'variacao_preco_prox']
x = df_samples.drop(columns=non_feature_cols).values

# Target: next-round price variation as an (n_samples, 1) column vector.
y = df_samples[['variacao_preco_prox']].values
print(x.shape, y.shape)
In [10]:
# Standardize the features. The fitted scaler stays available as `std`,
# and `x` is replaced by its scaled version.
std = StandardScaler().fit(x)
x = std.transform(x)
In [11]:
# Fit an ordinary least-squares model on the scaled features.
# `fit` returns the estimator itself, so the trailing expression displays it.
reg = LinearRegression().fit(x, y)
reg
Out[11]:
In [12]:
# Predict on the same data the model was fitted on and plot the residuals
# (actual minus predicted) in sample order.
y_pred = reg.predict(x)
diff = y - y_pred

fig, ax = plt.subplots(figsize=(20, 10))
ax.plot(diff, color='red')
Out[12]:
In [13]:
# Mean squared error of the predictions against the targets they were fitted on.
mean_squared_error(y_true=y, y_pred=y_pred)
Out[13]:
In [14]:
# Coefficient of determination (R^2) on the same fitted data.
r2_score(y_true=y, y_pred=y_pred)
Out[14]:
In [15]:
# Mean and (population) standard deviation of the residuals.
np.mean(diff), np.std(diff)
Out[15]:
In [16]:
# Print the fitted coefficients and then the intercept, one per line.
# Coefficients apply to the standardized features, in feature-column order.
for fitted_param in (reg.coef_, reg.intercept_):
    print(fitted_param)